---
author: Marcus Rohrmoser
categories:
- en
date: 2019-07-14 16:32:23+02:00
tags:
- iOS
- Swift
- libxml2
- ShaarliOS
- HTML
title: 'Parse HTML&nbsp;forms with 120&nbsp;LOC of swift'
type: post
url: /2019/07/swift-libxml2-html/
---

My iOS sharing extension [#Shaarli💫
(sourcecode+tests)](http://mro.name/ShaarliOS) has to communicate with HTML
backends. Scraping data from HTML forms, sending HTTP posts.  Now I want to
avoid parsing HTML myself, but luckily [libxml2
HTMLParser](http://xmlsoft.org/html/libxml-HTMLparser.html) does that and is
available on iOS.  Giants, shoulders, you know.

Also I want to avoid holding the complete document in memory, let alone
traversing a DOM, when I can grab the few things I need from a [sax
parser](http://www.saxproject.org/) while they fly by. I used to get the form data with
C/Objective-C and xpath, OMG. Having as little own, non-Swift-managed
code as possible for parsing arbitrary content from the internet seems
desirable.

So `libxml2` for html parsing, sax (streaming or push parser) for slim
footprint and Swift for memory safety and 'modern' idioms
([Closures!](https://docs.swift.org/swift-book/LanguageGuide/Closures.html)).


Let's add `libxml2` sax parsing mojo to the Xcode project, following [The Red
Queen Coder](http://redqueencoder.com/wrapping-libxml2-for-swift/):

* in build settings look for "Header Search Paths" and add  
`$(SDKROOT)/usr/include/libxml2`
* add to "Other Linker Flags"  
`-lxml2`
* set "Objective-C Bridging Header" to a `foo/Bridging.h`, see below.

The actual parsing finally turned out simpler than expected.

Especially the closures wiring the sax callbacks in line 52 look nice, don't
they?

Happy parsing!

----

## Addendum

### `Bridging.h`

<pre class="line-numbers"><code class="language-swift">//
// Bridging header to access libxml2 html parsing from Swift.
// http://mro.name/ShaarliOS
//
// First adjust some settings as described by http://redqueencoder.com/wrapping-libxml2-for-swift/
//
// * add to Xcode build settings “Header Search Paths”:
//     $(SDKROOT)/usr/include/libxml2
// * add to Xcode build settings "Other Linker Flags."
//     -lxml2
//
// Also: https://github.com/SonoPlot/Swift-libxml/blob/master/LibXMLWrapperExample/LibXMLWrapperExample/Bridging-Header.h

#import &lt;libxml/HTMLparser.h>
#import &lt;libxml/xmlerror.h>
</code></pre>

### `HtmlFormParser.swift`

<pre class="line-numbers"><code class="language-swift">//
//  HtmlFormParser.swift
//  http://mro.name/ShaarliOS
//
//  Created by Marcus Rohrmoser on 09.06.19.
//  Copyright © 2019 Marcus Rohrmoser mobile Software. All rights reserved.
//

import Foundation

typealias HtmlFormDict = [String:String] // one form: field (or attribute) name -> value

// Collect the forms of an html document (uses libxml2 graceful html parsing); a nil body yields no forms.
func findHtmlForms(_ body:Data?, _ encoding:String?) -> [String:HtmlFormDict] {
    return HtmlFormParser().parse(body) // NOTE(review): `encoding` is not handed to the parser.
}

// Build a dictionary from a nil-terminated sequence of alternating names and values.
// A name without a value (abbreviated html5 attribute) gets the name itself as value.
internal func atts2dict(_ atts: (Int) -> String?) -> HtmlFormDict {
    var pairs:HtmlFormDict = [:]
    // names sit at the even positions, each value right behind its name.
    for nameAt in stride(from: 0, to: Int.max, by: 2) {
        guard let key = atts(nameAt) else { break }
        pairs[key] = atts(nameAt + 1) ?? key
    }
    return pairs
}

// Strictly decode a NUL-terminated utf8 C string; nil for a nil pointer or malformed bytes. See
// https://github.com/apple/swift-corelibs-foundation/blob/master/Foundation/XMLParser.swift#L33
private func decode(_ bytes:UnsafePointer&lt;xmlChar>?) -> String? {
    guard let cString = bytes else { return nil }
    return String.decodeCString(cString, as:UTF8.self, repairingInvalidCodeUnits:false)?.result
}

private func me(_ ptr : UnsafeRawPointer?) -> HtmlFormParser { // get the parser back from the opaque sax user data
    return Unmanaged&lt;HtmlFormParser>.fromOpaque(ptr!).takeUnretainedValue() // unretained, traps on a nil pointer
}

private class HtmlFormParser { // sax event sink collecting the fields of each form; use a fresh instance per document
    private var forms : [String:HtmlFormDict] = [:] // completed forms, keyed by form name
    private var form : HtmlFormDict = [:]           // fields of the form currently being read
    private var formName = ""                       // name (or id) of that form
    private var textName = ""                       // name of the open textarea, empty outside of one
    private var text = ""                           // characters collected for the open textarea
    // Push `data` through the libxml2 html sax parser; returns the forms found (none for nil data).
    func parse(_ data:Data?) -> [String:HtmlFormDict] {
        guard let data = data else { return [:] }
        var sax = htmlSAXHandler()
        sax.initialized = XML_SAX2_MAGIC
        sax.startElement = { me($0).startElement(name:$1, atts:$2) }
        sax.endElement = { me($0).endElement(name:$1) }
        sax.characters = { me($0).charactersFound(ch:$1, len:$2) }
        // handler.error = errorEncounteredSAX

        // https://curl.haxx.se/libcurl/c/htmltitle.html
        // http://xmlsoft.org/html/libxml-HTMLparser.html#htmlParseChunk
        // https://stackoverflow.com/questions/41140050/parsing-large-xml-from-server-while-downloading-with-libxml2-in-swift-3
        // https://github.com/apple/swift-corelibs-foundation/blob/master/Foundation/XMLParser.swift#L524
        // http://redqueencoder.com/wrapping-libxml2-for-swift/ bzw. https://github.com/SonoPlot/Swift-libxml

        // `self` travels unretained as sax user data; the callbacks above get it back via `me`.
        // Without a parser context there is nothing to parse, so return what we have (nothing).
        guard let ctxt = htmlCreatePushParserCtxt(&sax, Unmanaged.passUnretained(self).toOpaque(), "", 0, "", XML_CHAR_ENCODING_NONE) else { return forms }
        defer { htmlFreeParserCtxt(ctxt) }
        let _ = data.withUnsafeBytes { htmlParseChunk(ctxt, $0, Int32(data.count), 0) }
        // an empty chunk with the terminate flag set finishes the document.
        let _ = htmlParseChunk(ctxt, "", 0, 1)

        return forms
    }

    // Decode the element name if it is one of form, input or textarea; nil for anything else.
    // The strcmp test runs first so no String is built for the many elements we do not care about.
    // https://github.com/MaddTheSane/chmox/blob/3263ddf09276f6a47961cc4b87762f58b88772d0/CHMTableOfContents.swift#L75
    private func relevantElement(_ name: UnsafePointer&lt;xmlChar>?) -> String? {
        guard let raw = UnsafeRawPointer(name)?.assumingMemoryBound(to: Int8.self) else { return nil }
        if 0 != strcmp("form", raw) && 0 != strcmp("input", raw) && 0 != strcmp("textarea", raw) {
            return nil
        }
        return decode(name)
    }

    // sax callback: an element opens. Starts a new form or textarea, or records an input's value.
    private func startElement(name: UnsafePointer&lt;xmlChar>? , atts:UnsafePointer&lt;UnsafePointer&lt;xmlChar>?>?) {
        guard let elm = relevantElement(name) else { return }
        // libxml2 passes NULL rather than an empty list for elements without attributes. Such an
        // element must still be processed, otherwise a bare <form> would inherit the fields
        // of the form before it.
        var att:HtmlFormDict = [:]
        if let atts = atts {
            att = atts2dict({ decode(atts[$0]) })
        }
        let nam = att["name"] ?? att["id"] ?? ""
        switch elm {
        case "form":
            formName = nam
            form = [:]
        case "textarea":
            textName = nam
            text = ""
        case "input":
            // checkboxes report their `checked` attribute (absent or "off": no entry),
            // every other input its `value`.
            form[nam] = "checkbox" == att["type"]
                ? ("off" == att["checked"] ? nil : att["checked"])
                : att["value"]
        default:
            break
        }
    }

    // sax callback: an element closes. Stores the finished form or the text of the finished textarea.
    private func endElement(name:UnsafePointer&lt;xmlChar>?) {
        guard let elm = relevantElement(name) else { return }
        switch elm {
        case "form":
            forms[formName] = form
            formName = ""
        case "textarea":
            form[textName] = text
            textName = ""
        default:
            break
        }
    }

    // sax callback: character data. Only text inside a named textarea is kept.
    private func charactersFound(ch: UnsafePointer&lt;xmlChar>?, len: CInt) {
        // nothing to do outside a textarea, without a buffer or for an empty/invalid length.
        guard !textName.isEmpty, let ch = ch, len > 0 else { return }
        let d = Data(bytes: ch, count:Int(len)) // clamp
        let s = String(data: d, encoding: .utf8) ?? "&lt;utf8 decoding issue>"
        text.append(s)
    }
}
</code></pre>

